April 5, 2021
#@title
# How to update the data:
# 1. You may need to copy this colab so you have your own version.
# 2. Update the table name constants below to have the latest data's suffix.
# 3. Update the date variables below to be the last case date included in the data.
# 4. Updates the scatterplot max/min below in chart settings may need to be updated for more cases.
# 5. There are a few checks for the county_fips_mapping that we created due to issues with the CDC's.
# Instructions are at https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006.
# 6. Many/all of the tables and text are not auto-updated. If you want to do a full updated of
# the paper including text and tables, a lot of that is done in commented out PrintSummaryStats() statements.
import pandas as pd
import altair as alt
from vega_datasets import data
from google.colab import auth
auth.authenticate_user()
# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
# Project and table names.
PROJECT_ID = 'msm-secure-data-1b'
CDC_TABLE = '`%s.ndunlap_secure.cdc_restricted_access_20210331`' % PROJECT_ID
CDC_PUBLIC_GEO_TABLE = '`%s.ndunlap_secure.cdc_public_use_with_geo_20210331`' % PROJECT_ID
CRDT_TABLE = '`%s.ndunlap_secure.crdt_20210307`' % PROJECT_ID
CREW_TABLE = '`msm-internal-data.crew.covid_case_surveillance`'
# Dates in different formats.
DATE = 'DATE(2021, 03, 16)'
DATE_DISPLAY_NAME = 'Mar 16'
CRDT_DATE = '20210307'
CRDT_COMPARE_DATE = 'DATE(2021, 03, 07)'
CRDT_COMPARE_DATE_DISPLAY_NAME = 'Mar 7'
TOTAL_CRDT_CASES = 28756184 # from https://covidtracking.com/data/national
# Set the scatterplot max/min to better handle outliers (CA, Los Angeles).
TOTAL_CASES_SCALE_MAX = 4000000
COUNTY_CASES_SCALE_MAX = 1400000
COUNTY_CASES_ZOOM_SCALE_MAX = 200000
CASES_RACE_SCALE_MAX = 3000000 # known race/ethnicity
COUNTY_CASES_RACE_SCALE_MAX = 400000 # known race/ethnicity
# Chart settings.
SCATTER_HEIGHT = 300
SCATTER_WIDTH = 300
MAP_HEIGHT = 300
MAP_WIDTH = 450
US_STATES_TOPO = alt.topo_feature(data.us_10m.url, 'states')
US_COUNTIES_TOPO = alt.topo_feature(data.us_10m.url+"#", 'counties')
TERRITORIES = ('PR', 'GU', 'VI', 'MP', 'AS')
NYT_TERRITORIES = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
STATES_TO_FIPS = {'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'AS': 3, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12, 'GA': 13, 'GU': 14, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23, 'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'PR': 43, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'VI': 52, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56, 'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78, 'NYC': 36}
FIPS_TO_STATES = {STATES_TO_FIPS[key]: key for key in STATES_TO_FIPS}
RACE_ETHNICITY_COMBINED_MAP = {
'Asian, Non-Hispanic': 'asian_cases',
'Black, Non-Hispanic': 'black_cases',
'White, Non-Hispanic': 'white_cases',
'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
'Hispanic/Latino': 'hispanic_cases',
'Multiple/Other, Non-Hispanic': 'other_cases',
'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
'Missing': 'unknown_cases',
'Unknown': 'unknown_cases',
'NA': 'na_cases',
}
RACE_ETHNICITY_COMBINED_KNOWN_MAP = {
'Asian, Non-Hispanic': 'cdc_known_cases',
'Black, Non-Hispanic': 'cdc_known_cases',
'White, Non-Hispanic': 'cdc_known_cases',
'American Indian/Alaska Native, Non-Hispanic': 'cdc_known_cases',
'Hispanic/Latino': 'cdc_known_cases',
'Multiple/Other, Non-Hispanic': 'cdc_known_cases',
'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'cdc_known_cases',
'Missing': 'cdc_unknown_cases',
'Unknown': 'cdc_unknown_cases',
'NA': 'cdc_na_cases',
}
#@title
CDC_OVERALL_RACE_QUERY = ('''
SELECT
race_ethnicity_combined,
COUNT(*) as cases
FROM
%s
GROUP BY
1
''' % CDC_TABLE)
CRDT_QUERY = ('''
SELECT
State as state,
Cases_Total as crdt_cases,
Cases_Total - Cases_Unknown as crdt_known_race_cases,
ROUND(1 - Cases_Unknown / Cases_Total, 4) as crdt_known_race_cases_percent,
Cases_Total - Cases_Ethnicity_Unknown as crdt_known_ethnicity_cases,
ROUND(1 - Cases_Ethnicity_Unknown / Cases_Total, 4) as crdt_known_ethnicity_cases_percent,
FROM %s
WHERE
date = %s
''' % (CRDT_TABLE, CRDT_DATE))
NYT_STATES_QUERY_STR = ('''
SELECT
state_name,
state_fips_code,
confirmed_cases as nyt_cases,
deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
date = %s AND
state_fips_code IS NOT NULL
''')
NYT_STATES_QUERY = NYT_STATES_QUERY_STR % DATE
NYT_STATES_COMPARE_CRDT_QUERY = NYT_STATES_QUERY_STR % CRDT_COMPARE_DATE
CDC_STATES_QUERY = ('''
SELECT
res_state,
COUNT(*) as cdc_cases
FROM
%s
GROUP BY
res_state
''' % CDC_TABLE)
NYT_COUNTIES_QUERY = ('''
SELECT
county_fips_code,
confirmed_cases as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
date = %s AND
county_fips_code IS NOT NULL
''' % DATE)
CDC_COUNTIES_RACE_QUERY = ('''
SELECT
res_state,
res_county,
race_ethnicity_combined,
COUNT(*) as cdc_cases
FROM
%s
GROUP BY
res_county,
res_state,
race_ethnicity_combined
''' % CDC_TABLE)
COUNTY_FIPS_MAPPING_QUERY = ('''
SELECT
*
FROM
`msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''')
ACS_POPULATION_DATA_QUERY = ('''
SELECT
state,
county,
county_fips,
total_pop
FROM
`msm-internal-data.ipums_acs.acs_2019_5year_county`
''')
CDC_STATES_RACE_QUERY = ('''
SELECT
res_state,
race_ethnicity_combined,
COUNT(*) as cdc_cases
FROM
%s
GROUP BY
res_state,
race_ethnicity_combined
''' % CDC_TABLE)
CDC_STATES_RACE_UP_TO_CRDT_QUERY = ('''
SELECT
res_state,
race_ethnicity_combined,
COUNT(*) as cdc_cases
FROM
%s
WHERE
cdc_case_earliest_dt <= %s
GROUP BY
res_state,
race_ethnicity_combined
''' % (CDC_TABLE, CRDT_COMPARE_DATE))
CDC_PUBLIC_GEO_STATES_QUERY = ('''
SELECT
res_state,
COUNT(*) as cdc_public_geo_cases
FROM
%s
GROUP BY
res_state
''' % CDC_PUBLIC_GEO_TABLE)
CDC_PUBLIC_GEO_COUNTIES_RACE_QUERY = ('''
SELECT
res_state,
res_county,
CASE ethnicity = "Non-Hispanic/Latino"
WHEN true THEN race
ELSE ethnicity
END as race_ethnicity_combined,
COUNT(*) as cdc_public_geo_cases
FROM
%s
GROUP BY
res_county,
res_state,
race_ethnicity_combined
''' % CDC_PUBLIC_GEO_TABLE)
CDC_PUBLIC_GEO_STATES_RACE_QUERY = ('''
SELECT
res_state,
CASE ethnicity = "Non-Hispanic/Latino"
WHEN true THEN race
ELSE ethnicity
END as race_ethnicity_combined,
COUNT(*) as cdc_public_geo_cases
FROM
%s
GROUP BY
res_state,
race_ethnicity_combined
''' % CDC_PUBLIC_GEO_TABLE)
#@title
def FieldAnalysis(project_id, table, field_list, calculate_race_ethnicity=False):
dict = {}
for field in field_list:
dict[field] = [0.0, 0.0, 0.0, 0.0]
unknowns = pd.DataFrame(dict, index=['Unknown', 'Missing', 'NA', 'Known'])
field_series = []
value_series = []
percent_series = []
cases_series = []
chart_denominator = 1000000
for field in field_list:
field_unknowns_query = ('''
SELECT
%s,
count(*) as cases
FROM
%s
GROUP BY
%s
''')
if calculate_race_ethnicity and field == 'race_ethnicity_combined':
field_unknowns_query = ('''
SELECT
CASE ethnicity = "Non-Hispanic/Latino"
WHEN true THEN race
ELSE ethnicity
END as %s,
count(*) as cases
FROM
%s
GROUP BY
%s
''')
query = field_unknowns_query % (field, table, field)
field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
field_unknowns_df.set_index(field, inplace=True)
field_unknowns_df.index = field_unknowns_df.index.fillna('Null')
field_display_name = {
'cdc_case_earliest_dt': 'CDC earliest case date',
'current_status': 'Case status',
'res_state': 'State',
'res_county': 'County',
'sex': 'Sex',
'age_group': 'Age',
'race_ethnicity_combined': 'Race/Ethnicity',
'race': 'Race',
'ethnicity': 'Ethnicity',
'case_month': 'Case month'
}
missing_count = 0
if 'Missing' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['Missing'].cases
if 'Null' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['Null'].cases
if '' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc[''].cases
if 'OTH' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['OTH'].cases
if 'nul' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['nul'].cases
unknowns.loc['Missing', field] = missing_count
if 'Unknown' in field_unknowns_df.index:
unknowns.loc['Unknown', field] = field_unknowns_df.loc['Unknown'].cases
if 'NA' in field_unknowns_df.index:
unknowns.loc['NA', field] = field_unknowns_df.loc['NA'].cases
unknowns.loc['Known', field] = field_unknowns_df.cases.sum() - (
unknowns.loc['Missing', field] +
unknowns.loc['Unknown', field] +
unknowns.loc['NA', field])
field_series.extend([field_display_name.get(field, field)] * 4)
value_series.extend(['Known', 'Suppressed', 'Unknown', 'Missing'])
percent_series.extend([unknowns.loc['Known', field] / field_unknowns_df.cases.sum(),
unknowns.loc['NA', field] / field_unknowns_df.cases.sum(),
unknowns.loc['Unknown', field] / field_unknowns_df.cases.sum(),
unknowns.loc['Missing', field] / field_unknowns_df.cases.sum()])
cases_series.extend([unknowns.loc['Known', field] / chart_denominator,
unknowns.loc['NA', field] / chart_denominator,
unknowns.loc['Unknown', field] / chart_denominator,
unknowns.loc['Missing', field] / chart_denominator])
bars = pd.DataFrame.from_dict({'field': field_series,
'value': value_series,
'percent': percent_series,
'cases': cases_series})
return alt.Chart(bars).mark_bar().encode(
x=alt.X('percent', axis=alt.Axis(format='%'), title=''),
y=alt.Y('field', sort='x', title='Field'),
color=alt.Color('value', scale=alt.Scale(scheme='category20'), title='Value'),
order=alt.Order('field:N'),
tooltip=[
alt.Tooltip('field:N', title='Field'),
alt.Tooltip('value:N', title='Value'),
alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
alt.Tooltip('cases:Q', format=',.2f', title='Cases in group (millions)'),
]
)
def CreateNYTStateDataframe(query):
nyt_states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for territory in NYT_TERRITORIES:
nyt_states_df = nyt_states_df[nyt_states_df.state_name != territory]
nyt_states_df['state_fips_code'] = nyt_states_df.state_fips_code.astype(int)
nyt_states_df.set_index('state_fips_code', inplace=True)
return nyt_states_df
def CreateCDCStateDataframe(query):
states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for state in ('Unknown', 'NA', 'Missing', 'OCONUS'):
states_df = states_df[states_df.res_state != state]
states_df.rename(columns={'res_state': 'state'}, inplace=True)
states_df['state_fips_code'] = states_df.state
states_df = states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
states_df['state_fips_code'] = states_df.state_fips_code.astype(int)
states_df.set_index('state_fips_code', inplace=True)
return states_df
def CreateCDCStateRaceDataframe(query, cases_field_prefix):
states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for state in ('Unknown', 'NA', 'Missing', 'OCONUS'):
states_df = states_df[states_df.res_state != state]
states_df['race_ethnicity_combined'] = states_df.race_ethnicity_combined.astype('string').str.strip()
states_df = states_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_KNOWN_MAP})
states_df.rename(columns={'res_state': 'state'}, inplace=True)
cases_field = cases_field_prefix + 'cases'
crosstab_df = pd.crosstab(states_df['state'],
states_df.race_ethnicity_combined,
values=states_df[cases_field],
aggfunc=sum,
margins=True,
margins_name=cases_field
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df[cases_field_prefix + 'known_cases'] = crosstab_df[cases_field] - crosstab_df.cdc_na_cases.fillna(0) - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df[cases_field_prefix + 'known_or_na_cases'] = crosstab_df[cases_field] - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df[cases_field_prefix + 'known_cases_percent'] = round(
crosstab_df[cases_field_prefix + 'known_cases'] /
crosstab_df[cases_field], 4)
crosstab_df[cases_field_prefix + 'known_or_na_cases_percent'] = round(
crosstab_df[cases_field_prefix + 'known_or_na_cases'] /
crosstab_df[cases_field], 4)
crosstab_df['state_fips_code'] = crosstab_df.state
crosstab_df = crosstab_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
# Remove territories and missing states for calculating summary stats.
for territory in TERRITORIES:
crosstab_df = crosstab_df[crosstab_df.state != territory]
crosstab_df = crosstab_df[crosstab_df.state != 'NA']
crosstab_df = crosstab_df[crosstab_df.state != 'Missing']
crosstab_df = crosstab_df[crosstab_df.state != 'Unknown']
crosstab_df.set_index('state_fips_code', inplace=True)
return crosstab_df
def CreateCRDTStateRaceDataframe(query):
crdt_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for territory in TERRITORIES:
crdt_df = crdt_df[crdt_df.state != territory]
crdt_df['state_fips_code'] = crdt_df.state
crdt_df = crdt_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
crdt_df['state_fips_code'] = crdt_df.state_fips_code.astype(int)
crdt_df.set_index('state_fips_code', inplace=True)
return crdt_df
def CreateNYTCountyDataframe(query):
nyt_counties_df = pd.io.gbq.read_gbq(NYT_COUNTIES_QUERY, project_id=PROJECT_ID)
nyt_counties_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_df.county_fips.unique()
nyt_counties_df['county_fips'] = nyt_counties_df.county_fips.astype(int)
nyt_counties_df.set_index('county_fips', inplace=True)
return nyt_counties_df
def CreateCDCCountyRaceDataframe(query, cases_field_prefix):
cdc_counties_race_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for territory in TERRITORIES:
cdc_counties_race_df = cdc_counties_race_df[cdc_counties_race_df.res_state != territory]
county_fips_map_df = pd.io.gbq.read_gbq(COUNTY_FIPS_MAPPING_QUERY, project_id=PROJECT_ID)
county_fips_map_df.cdc_county = county_fips_map_df.cdc_county.str.lower()
county_fips_map_df['state_county'] = county_fips_map_df.state + '-' + county_fips_map_df.cdc_county
county_fips_map_df['state_county'] = county_fips_map_df.state_county.astype('string').str.strip()
county_fips_map_df.set_index('state_county', inplace=True)
# Concatenate the state and county names because county names are not unique across states.
cdc_counties_race_df.res_county = cdc_counties_race_df.res_county.str.lower()
cdc_counties_race_df['state_county'] = cdc_counties_race_df.res_state + '-' + cdc_counties_race_df.res_county
cdc_counties_race_df['state_county'] = cdc_counties_race_df.state_county.astype('string').str.strip()
cdc_counties_race_df.set_index('state_county', inplace=True)
cdc_counties_race_df['race_ethnicity_combined'] = cdc_counties_race_df.race_ethnicity_combined.astype('string').str.strip()
cdc_counties_race_df = cdc_counties_race_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_MAP})
# Printed value used in the footnotes below.
# All other checks for county_fips_code mappings are now in
# https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006
mismatches_df = cdc_counties_race_df.join(county_fips_map_df, on="state_county", how='outer', lsuffix='_left', rsuffix='_right')
mismatches_df = mismatches_df[mismatches_df.county_fips.isna()]
mismatches_df = mismatches_df[mismatches_df.res_state != 'NA']
mismatches_df = mismatches_df[mismatches_df.res_state != 'Unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'na']
mismatches_df = mismatches_df[mismatches_df.res_county != 'unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'missing']
# print(mismatches_df.cases.sum())
cdc_counties_race_df = cdc_counties_race_df.join(county_fips_map_df, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')
cases_field = cases_field_prefix + 'cases'
# Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
cdc_counties_race_crosstab_df = pd.crosstab(cdc_counties_race_df['county_fips'],
cdc_counties_race_df.race_ethnicity_combined,
values=cdc_counties_race_df[cases_field],
aggfunc=sum,
margins=True,
margins_name=cases_field
)
# Have to reset_index() to go from pandas multi-index to single index.
cdc_counties_race_crosstab_df = cdc_counties_race_crosstab_df.reset_index()
cdc_counties_race_crosstab_df.drop(axis=0, index=len(cdc_counties_race_crosstab_df) - 1, inplace=True)
cdc_counties_race_crosstab_df['county_fips'] = cdc_counties_race_crosstab_df.county_fips.astype(int)
cdc_counties_race_crosstab_df[cases_field_prefix + 'known_cases'] = (
cdc_counties_race_crosstab_df[cases_field] -
cdc_counties_race_crosstab_df.na_cases.fillna(0) -
cdc_counties_race_crosstab_df.unknown_cases.fillna(0))
# Get the display names for each county.
# Use ACS data that only has one FIPS code per county unlike the fips_county_map.
acs_name_lookup_df = pd.io.gbq.read_gbq(ACS_POPULATION_DATA_QUERY, project_id=PROJECT_ID)
acs_name_lookup_df['state_county'] = (acs_name_lookup_df.county.astype('string').str.strip() +
', ' + acs_name_lookup_df.state.astype('string').str.strip())
acs_name_lookup_df.drop(columns=['state', 'county'], inplace=True)
acs_name_lookup_df.set_index('county_fips', inplace=True)
cdc_counties_race_df = cdc_counties_race_crosstab_df.join(acs_name_lookup_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
cdc_counties_race_df.county_fips = cdc_counties_race_df.county_fips.astype(int)
cdc_counties_race_df.set_index('county_fips', inplace=True)
return cdc_counties_race_df
def CreateScatterPlot(
chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
geo_field = 'state'
geo_field_display_name = 'State'
if geo == 'county':
geo_field = 'state_county'
geo_field_display_name = 'County'
if metric_type == 'ratio':
scale_scheme = 'blueorange'
scale_reverse = True
scale_domain = [0, 2]
legend_format = '.1f'
axis_format = ',.0f'
elif metric_type == 'percent':
scale_scheme = 'redyellowblue'
scale_reverse = False
scale_domain = [0, 1]
legend_format = '.0%'
axis_format = '.0%'
tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
for field in ('y', 'x', 'percent'):
tooltips.append(alt.Tooltip(
fields_dict[field]['name'] + ':Q',
format=fields_dict[field]['format'],
title=fields_dict[field]['title'],
))
plot = alt.Chart(chart_df).mark_circle(size=60).encode(
alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
scale=alt.Scale(domain=(0, scale_max))
),
alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
scale=alt.Scale(domain=(0, scale_max))
),
color=alt.Color(fields_dict['percent']['name'],
type='quantitative',
scale=alt.Scale(scheme=scale_scheme,
reverse=scale_reverse,
domain=scale_domain,
clamp=True),
legend=alt.Legend(format=legend_format),
title=metric_type.capitalize()),
tooltip=tooltips,
).properties(
height=height,
width=width,
)
if metric_type == 'ratio':
plot.interactive()
line = pd.DataFrame({
'x': [0, scale_max],
'y': [0, scale_max],
})
if metric_type == 'ratio':
line_plot = alt.Chart(line).mark_line(color='black').encode(
x='x',
y='y',
)
elif metric_type == 'percent':
line_plot = (
alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
)
# Add interative for concatenating due to https://github.com/altair-viz/altair/issues/2010.
scatter = (plot + line_plot).properties(
title=title,
height=height,
width=width,
).interactive()
return scatter
def CreateMap(
chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
geo_field = 'state'
geo_field_display_name = 'State'
fips_code = 'state_fips_code'
topo_feature = US_STATES_TOPO
if geo == 'county':
geo_field = 'state_county'
geo_field_display_name = 'County'
fips_code = 'county_fips'
topo_feature = US_COUNTIES_TOPO
if metric_type == 'ratio':
scale_scheme = 'blueorange'
scale_reverse = True
scale_domain = [0, 2]
legend_format = '.1f'
elif metric_type == 'percent':
scale_scheme = 'redyellowblue'
scale_reverse = False
scale_domain = [0, 1]
legend_format = '.0%'
highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
for field in ('y', 'x', 'percent'):
tooltips.append(alt.Tooltip(
fields_dict[field]['name'] + ':Q',
format=fields_dict[field]['format'],
title=fields_dict[field]['title'],
))
field_names = [geo_field]
field_names.extend([fields_dict[field]['name'] for field in fields_dict])
plot = alt.Chart(topo_feature).mark_geoshape(
stroke='white',
strokeOpacity=.2,
strokeWidth=1
).project(
type='albersUsa'
).transform_lookup(
lookup='id',
from_=alt.LookupData(chart_df, fips_code, field_names)
).encode(
alt.Color(fields_dict['percent']['name'],
type='quantitative',
legend=alt.Legend(format=legend_format),
scale=alt.Scale(scheme=scale_scheme,
reverse=scale_reverse,
domain=scale_domain,
clamp=True,
),
title=metric_type.capitalize()),
tooltip=tooltips
).add_selection(
highlight,
)
states_outline = alt.Chart(US_STATES_TOPO).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
type='albersUsa'
)
states_fill = alt.Chart(US_STATES_TOPO).mark_geoshape(
fill='silver',
stroke='white'
).project('albersUsa')
layered_map = alt.layer(states_fill, plot, states_outline).properties(
height=height,
width=width,
title=title,
)
return layered_map
def CreateScatterPlotAndMap(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
scatter = CreateScatterPlot(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, geo, metric_type)
map = CreateMap(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, map_width, geo, metric_type)
return (scatter | map).configure_view(
strokeWidth=0,
).configure_mark(
stroke='grey'
).configure_legend(
gradientLength=scatter_height - 50
)
def PrintSummaryStats(chart_df, field='percent'):
plus_minus_15_df = chart_df[chart_df[field] >= .85]
plus_minus_15_df = plus_minus_15_df[plus_minus_15_df[field] <= 1.15]
print('between +/-15%: ', len(plus_minus_15_df), round(len(plus_minus_15_df) / len(chart_df), 2))
plus_minus_50_df = chart_df[chart_df[field] >= .50]
plus_minus_50_df = plus_minus_50_df[plus_minus_50_df[field] <= 1.50]
print('between +/-50%: ', len(plus_minus_50_df), round(len(plus_minus_50_df) / len(chart_df), 2))
print('< than .50: ', len(chart_df[chart_df[field] < .5]))
print('> than 1.50: ', len(chart_df[chart_df[field] > 1.5]))
print(chart_df[field].describe())
The Covid Tracking Project was the most reliable source for COVID-19 data with race/ethnicity at the state level until it stopped collecting data on March 7, 2021. The CDC's Case Surveillance Restricted Access and Public Use with Geography datasets could potentially replace the Covid Tracking Project's dataset and additionally enable analyses of age and race/ethnicity along with county-level data. This paper evaluates the completeness of the CDC datasets at the state and county levels in terms of (1) the total number of cases included compared to the New York Times, and (2) the number of cases included with race/ethnicity data compared to the Covid Tracking Project.
The CDC's Restricted Access dataset contains 76% of the cases in the New York Times up to March 16, and 57% of cases have race/ethnicity information vs. 67% in the Covid Tracking Project. At the state and county levels, the dataset's completeness is highly variable; for example, Minnesota has 101% of the cases included in the New York Times, while Louisiana has 5% of the cases in the New York Times. Minnesota has 91% of cases with race/ethnicity, while Louisiana has 19% with race/ethnicity (vs. 94% in the Covid Tracking Project). Texas alone is missing 2.7M cases (more than a third of the total 7.1M missing cases). California is missing race/ethnicity for 2.6M people (more than a quarter of the 9.7M cases missing race/ethnicity) including all 1.5M Hispanic/Latino cases reported on the state public health website. The CDC's Public Use with Geography dataset is similar to the Restricted Access dataset for total case counts, but is less complete due to more privacy suppression; e.g., only 46% of cases have race/ethnicity information.
The CDC datasets are useful for the individual states and counties that have a high degree of data completeness, but we hope that the dataset's completeness will continue to improve across all states and counties.
The racial and ethnic disparities in the COVID-19 pandemic have exposed longstanding health inequities in the U.S., which have been described in multiple analyses of COVID-19 data by the Covid Tracking Project, New York Times, American Public Media Research Lab, and Kaiser Family Foundation among many others. Unfortunately, we still don't have a full understanding of these disparities because race/ethnicity data continue to be fragmented and incomplete. On January 29, the Covid Tracking Project wrote, "the continued lack of either complete federal demographic data or federal guidelines for what states should publish make it impossible to fully understand who is being infected with and dying of COVID-19."
The most reliable and up-to-date data are scattered across state and local public health websites that use different standards and categories for reporting race/ethnicity. In 2020, collecting these data and turning them into a unified dataset was largely left to non-governmental organizations like the Covid Tracking Project, which stopped collecting data on March 7, 2021, a full year after it started. Even the federal government looked to the Covid Tracking Project for reliable COVID-19 race/ethnicity data. The office of the Assistant Secretary for Planning and Evaluation, an agency within the U.S. Department of Health and Human Services, wrote in October 2020, "The volunteer-based COVID tracking project has created the most comprehensive centralized resource for race and ethnicity data at the state level."
The outlook for race/ethnicity data on cases is even bleaker at the county level. Since November 2020, the CDC has shown total case counts at the county level in a dashboard. Before the CDC published that data, several non-governmental organizations (New York Times, Johns Hopkins University, USAFacts) gathered data for total case counts at the county level. But none of these sources collect or publish race/ethnicity data, which would be a huge undertaking due to the non-standard reporting of race/ethnicity across state and local public health websites. The only public analysis of case data with race/ethnicity at the county level was in July 2020 when the New York Times published The Fullest Look Yet at the Racial Inequity of Coronavirus. The New York Times used CDC case surveillance data that they obtained via FOIA and legal action to analyze cases up to May 28, 2020.
Now that the Covid Tracking Project has stopped collecting data, how can we track the disproportionate impact of COVID-19 on communities of color in the U.S. at the state and county levels? There is only one option for public COVID-19 case data with race/ethnicity as a unified dataset across U.S. states and counties: the CDC's case surveillance data, which is based on state and local health departments reporting cases to the CDC. There are more options for data on deaths, which we discuss in a separate deaths data report.
The CDC publishes several variations of this dataset:
These three datasets are from the same underlying source; they have the same number of records and are released at the end of each month. The Public Use dataset was first published in May 2020. The Restricted Access dataset was first published at the link above in November 2020; the dataset itself is in a private GitHub repository that the CDC shares with people who apply for access and adhere to a data use agreement. The CDC's initial restricted access data agreement did not allow for county-level analyses to be made public, but an updated data agreement from December 14, 2020 allowed such analyses. In January 2021, the Morehouse School of Medicine's Satcher Health Leadership Institute (MSM/SHLI) in collaboration with Google.org applied for and got access to this data within a few days. Several months later in March 2021, the CDC published the Public Use with Geography dataset with state and county information, fewer fields, and more data suppression than in the Restricted Access dataset. In addition to these datasets, the CDC also publishes charts showing race/ethnicity breakdowns at the U.S. level in their COVID Data Tracker; see the Appendix for a comparison of that tracker to these datasets.
The CDC's Case Surveillance Restricted Access dataset has enormous potential: It could allow us to analyze data across all states and counties to study the disparities in COVID-19 cases using consistent race/ethnicity categories. This dataset could pick up where the Covid Tracking Project's dataset left off and additionally enable the first analysis of race/ethnicity disparities at the county level since July 2020. The dataset also has age and sex for each case, so we could analyze the intersection of race/ethnicity with age and sex. The dataset has case report dates, which would allow us to look at cases over time. However, the dataset will only live up to its potential if it is complete both in terms of the number of cases included and the number of cases that have race/ethnicity.
Unfortunately, the restricted access dataset has significant completeness issues:
For the 12.8M cases where we do know race/ethnicity, we can see the following disparities across race/ethnicity groups:
#@title
overall_df = pd.io.gbq.read_gbq(CDC_OVERALL_RACE_QUERY, project_id=PROJECT_ID)
overall_df['race_ethnicity_combined'] = overall_df.race_ethnicity_combined.astype('string').str.strip()
overall_df = overall_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_MAP})
overall_df = overall_df.set_index('race_ethnicity_combined')
chart_denominator = 1000000
cases_list = [overall_df.cases['hispanic_cases'] / chart_denominator,
overall_df.cases['black_cases'] / chart_denominator,
overall_df.cases['white_cases'] / chart_denominator,
overall_df.cases['asian_cases'] / chart_denominator,
overall_df.cases['nhpi_cases'] / chart_denominator,
overall_df.cases['aian_cases'] / chart_denominator,
overall_df.cases.sum() / chart_denominator,
]
# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
60481746 / chart_denominator,
40596040 / chart_denominator,
196789401 / chart_denominator,
18427914 / chart_denominator,
565473 / chart_denominator,
2236348 / chart_denominator,
328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
'Hispanic/Latino',
'Black',
'White',
'Asian',
'Native Hawaiian/Pacific Islander',
'American Indian/Alaska Native',
'*Total Including Unknowns*',
], 'percent': percent_list,
'cases': cases_list,
'population': pop_list,
})
bars = alt.Chart(prevalence).mark_bar().encode(
x=alt.X('percent', axis=alt.Axis(format='.1%'), title=''),
y=alt.Y('group', sort='-x', title=''),
color=alt.Color('group',
scale=alt.Scale(scheme='tableau20'),
title='',
legend=None),
tooltip=[
alt.Tooltip('group:N', title='Race/Ethnicity Group'),
alt.Tooltip('percent:Q', format='.2%', title='Prevalence within group'),
alt.Tooltip('cases:Q', format=',.2f', title='Cases in group (millions)'),
alt.Tooltip('population:Q', format=',.2f', title='Population of group (millions)'),
]
).properties(
title='Percent of Race/Ethnicity Group who had COVID-19 based on Incomplete CDC Data up to %s' % DATE_DISPLAY_NAME
)
alt.concat(bars).properties(
title=alt.TitleParams(
['Population data source: The U.S. Census Bureau\'s American Community Survey 2019 5-year estimates.'],
baseline='bottom',
dy=20,
orient='bottom',
fontWeight='normal',
fontSize=11
)
).display()
But the chart above is based on incomplete data. With only 76% of cases included, the total percent of people who had COVID-19 should be 9.0% instead of 6.9%. It's harder to estimate how much the individual race/ethnicity data are undercounting the true number of confirmed COVID-19 cases. For example, the CDC data say that 0% of cases in California were Hispanic/Latino people, whereas the California public health website reports that Hispanics/Latinos made up 55.5% of California cases (1.5M people) as of March 17.
If we added all 9.7M cases with missing race/ethnicity to the Hispanic/Latino group, the percent of Hispanic/Latinos in the U.S. who had COVID-19 would go from 4.3% to 20.2% — a 5x increase. If all 9.7M cases with missing race/ethnicity were Black people, the percent of Black people who had COVID-19 would go from 3.8% to 27.6% — an 7x increase. While these extreme scenarios are unlikely, they show us why missing race/ethnicity data is preventing us from truly understanding and addressing the disparities in the COVID-19 pandemic in the U.S.
At the same time, the data from state public health websites are not perfect; the Covid Tracking Project only has race/ethnicity data for 67% of cases up to March 7. We'll compare data completeness at the state level in the case surveillance data vs. the New York Times and the Covid Tracking Project. If we can understand how complete the data are at the state level in the CDC's dataset, then we can confidently use that dataset to replace the Covid Tracking Project's dataset in certain states and know which states still have more reliable data on their public health websites.
The goal of this analysis is to assess the completeness of the CDC's Restricted Access and Public Use with Geography datasets to evaluate their feasibility in examining disparities in race/ethnicity for COVID-19 cases at the state and county levels. We'll first focus on the Restricted Access dataset, which is more complete than the Public Use with Geography dataset due to privacy reasons.
The overall data completeness findings for the CDC's Restricted Access Dataset are:
#@title
field_list = ['cdc_case_earliest_dt', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race_ethnicity_combined']
FieldAnalysis(PROJECT_ID, CDC_TABLE, field_list).display()
We can evaluate the overall completeness of the CDC datasets with race/ethnicity by calculating at (1) the percent of total case counts compared to the NYT, (2) the percent of cases with race/ethnicity, and then (3) combine those two percentages into a composite that represents the percentage of total expected cases that have race/ethnicity. Later on, we will do this same analysis at the state and county levels.
#@title
# Manually update these fields based on chart above, latest CDC data,
# and improving state/county data below.
overall_row_names = [
'Latest case date',
'Cases in dataset as of date',
'Cases in NYT as of date',
'(as a % of NYT)',
'Cases with race/ethnicity',
'(as a % of cases in dataset)',
'(composite % of NYT total with race/ethnicity)',
]
overall_crdt_metadata = [
'Mar 7, 2021',
'28.8M',
'29.0M',
'(99%)',
'19.2M',
'(67%)',
'(66%)',
]
overall_cdc_metadata = [
'Mar 16, 2021',
'22.5M',
'29.6M',
'(76%)',
'12.8M',
'(57%)',
'(43%)',
]
overall_cdc_public_geo_metadata = [
'Mar 16, 2021',
'22.5M',
'29.6M',
'(76%)',
'10.3M',
'(46%)',
'(35%)',
]
table_data = {'CRDT': overall_crdt_metadata, 'CDC Restricted': overall_cdc_metadata, 'CDC Public Geo': overall_cdc_public_geo_metadata}
metadata_df = pd.DataFrame(table_data, index=overall_row_names)
metadata_df.head(15)
Overall, the CDC datasets' completeness falls short of the CRDT dataset, however there are some states where the two datasets are comparable. We also look at ways to improve both of the CDC's datasets at the state and county levels.
What we didn't include in this report:
The CDC datasets come from a case report form that is a dense, five-page form about each lab-confirmed or probable COVID-19 case (old form prior to Jan 15, 2021). The Restricted Access dataset contains 32 fields, the Public Use with Geography dataset contains 19 fields, and the Public Use version contains 12 fields. In addition to data from the case report form, the CDC separately collects aggregate data that doesn't have race/ethnicity information.
The CDC distinguishes between aggregate data that comes from state and local public health websites vs. line- or case-level data that comes to the CDC from public health departments. The CDC FAQs say that aggregate data are more accurate than case data:
Aggregate counts provide the most up-to-date validated numbers on cases and deaths.
Public health websites often do contain race/ethnicity details, but state websites do not all use the same standard race/ethnicity categories, which makes it hard to collect and aggregate the data. The CRDT captured the many non-standard ways in which different states report on race/ethnicity, where ethnicity is whether a person is Hispanic/Latino. Some states report race/ethnicity as a combined field where each race/ethnicity group is mutually exclusive, which is how the CDC case dataset reports this field. Other states report race/ethnicity as separate fields where Hispanic/Latino people are counted within different race groups as well as in a separate field for ethnicity. States can also differ in terms of which race categories they use, how they define them, whether multiracial people are counted multiple times in different categories, and what's included in the "Other" race category. For more details, see this CRDT analysis.
So, we will need to sacrifice the accuracy and timeliness of aggregate data to get standardized race/ethnicity reporting on cases across all states and counties. However, standardized reporting on race/ethnicity is only useful if we have complete enough data in states and counties.
The CDC has an FAQ about data completeness:
How complete are the data that the CDC receives about COVID-19 cases?
The COVID-19 pandemic has put unprecedented demands on the public health data supply chain. In many states, the large number of COVID-19 cases has severely strained the ability of hospitals, healthcare providers, and laboratories to report cases with complete demographic information, such as race and ethnicity. The unprecedented volume of cases has also limited the ability of state and local health departments to conduct thorough case investigations and collect all requested case data.
As a result, many COVID-19 case notifications submitted to CDC do not have complete information on patient demographics [...] Because it can be time-consuming for jurisdictions to collect the additional information, these data can lag behind the aggregate counts. Because of missing data, analyses of these data elements are likely an underestimate of the true occurrence.
The CDC's COVID Data Tracker says that they are "working with states to provide more information on race/ethnicity for reported cases. The percent of reported cases that include race/ethnicity data is increasing."
We will compare the CDC data against two sources of aggregate data: The NYT and the CRDT's public data, which are aggregated from state and local public health websites. The CRDT is the only source for case data with race/ethnicity breakdowns, but there are several sources for county-level total case counts in addition to the NYT, such as JHU and USAFacts (this paper analyzes the differences between those sources at the state level up to July, 2020 for cases and deaths).
The table below compares geographic vs. race/ethnicity availability for these three different data sources:
#@title
row_names = [
'Total Cases — States',
'Total Cases — Counties',
'Cases by Race/Ethnicity — States',
'Cases by Race/Ethnicity — Counties'
]
nyt_yn = [
'✅',
'✅',
'❌',
'❌',
]
crdt_yn = [
'✅',
'❌',
'✅',
'❌',
]
cdc_yn = [
'✅',
'✅',
'✅',
'✅',
]
table_data = {'NYT': nyt_yn, 'CRDT': crdt_yn, 'CDC': cdc_yn}
availability_df = pd.DataFrame(table_data, index=row_names)
availability_df.head()
Because the CDC is the only data source that has race/ethnicity at the county level, the most similar data for purposes of comparison are (1) NYT data at the state and county levels with no race/ethnicity, and (2) CRDT data at the state level with race/ethnicity.
We will compare against the NYT up to March 16, 2021, which is the latest reporting date in the CDC data. We will compare against the CRDT up to March 7, 2021, which was the last date they reported data. We expect to see small differences (e.g., +/- 15%) in the case counts due to time lags in reporting the data, but time lags cannot explain large percentages of missing cases.
To get a baseline of how much we could expect the CDC case counts to match the NYT or CRDT, we can see how closely the NYT and CRDT match each other. Each dot below is a state (hover to see details), and the black line shows where the NYT and CRDT case counts are equal.
#@title
crdt_states_df = pd.io.gbq.read_gbq(CRDT_QUERY, project_id=PROJECT_ID)
crdt_states_df.set_index('state', inplace=True)
nyt_states_compare_crdt_df = CreateNYTStateDataframe(NYT_STATES_COMPARE_CRDT_QUERY)
crdt_states_df.reset_index(inplace=True)
crdt_states_df['state_fips_code'] = crdt_states_df.state
crdt_states_df = crdt_states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
crdt_states_df.set_index('state_fips_code', inplace=True)
nyt_crdt_counts_df = nyt_states_compare_crdt_df.join(crdt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
nyt_crdt_counts_df['percent'] = round(nyt_crdt_counts_df.crdt_cases / nyt_crdt_counts_df.nyt_cases, 2)
nyt_crdt_counts_df.reset_index(inplace=True)
#@title
nyt_crdt_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT cases'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CRDT to NYT'},
}
nyt_crdt_title = 'Ratio of CRDT to NYT Cases by State up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
nyt_crdt_counts_df, nyt_crdt_fields_dict, nyt_crdt_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(nyt_crdt_counts_df)
The ratio of NYT to CRDT cases is between 0.92 and 1.03 for all states:
We can see below that the CDC Restricted Access dataset case counts differ from the NYT case counts much more drastically than the CRDT did. Note: In the analysis and charts below, we'll refer to the CDC Restricted Access dataset as the "CDC" dataset.
#@title
cdc_states_df = CreateCDCStateDataframe(CDC_STATES_QUERY)
nyt_states_df = CreateNYTStateDataframe(NYT_STATES_QUERY)
cdc_nyt_states_df = cdc_states_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_df.reset_index(inplace=True)
cdc_nyt_states_df['percent'] = round(cdc_nyt_states_df.cdc_cases / cdc_nyt_states_df.nyt_cases, 4)
#@title
cdc_nyt_state_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_state_title = 'Ratio of CDC to NYT Cases by State up to %s' % DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_nyt_states_df, cdc_nyt_state_fields_dict, cdc_nyt_state_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_nyt_states_df)
Texas alone is missing 2.7M cases compared to the total case counts in the NYT data (38% of the 7.1M missing cases across all states).
The ratio of CDC to CRDT cases is between 0.02 and 1.05 for all states + D.C.:
The 29 states that are within +/-15% of the CRDT data could plausibly be off due to time lags in reporting cases to the CDC vs. reporting them on state public health websites, but there are many outlier states that are too far off from the CRDT case counts to be explained by a time lag:
We can do the same analysis at the county level. The CDC case surveillance dataset contains 3,072 counties in the 50 states + D.C., which is 98% of all counties that account for 99.9% of the population.
Each dot is a county (hover to see details). We show all 3,063 counties in the CDC data that were also in the NYT data on the left and zoom in on the smaller counties on the right. Note that the five counties in New York City and four boroughs in Alaska are missing because the NYT combined them into one region for New York City and two combined regions in Alaska; see the Appendix for more details.
#@title
# CDC vs. NYT county
cdc_counties_race_df = CreateCDCCountyRaceDataframe(CDC_COUNTIES_RACE_QUERY, 'cdc_')
nyt_counties_df = CreateNYTCountyDataframe(NYT_COUNTIES_QUERY)
cdc_nyt_counties_race_df = cdc_counties_race_df.join(nyt_counties_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
cdc_nyt_counties_race_df = cdc_nyt_counties_race_df.reset_index()
cdc_nyt_counties_race_df['percent'] = round(cdc_nyt_counties_race_df.cdc_cases / cdc_nyt_counties_race_df.nyt_cases, 2)
cdc_nyt_counties_race_df['cdc_known_cases_percent'] = round(cdc_nyt_counties_race_df.cdc_known_cases /
cdc_nyt_counties_race_df.cdc_cases, 2)
cdc_nyt_counties_race_df['cdc_known_or_na_cases'] = (cdc_nyt_counties_race_df.cdc_known_cases.fillna(0) +
cdc_nyt_counties_race_df.na_cases.fillna(0))
cdc_nyt_counties_race_df['cdc_known_or_na_cases_percent'] = round(cdc_nyt_counties_race_df.cdc_known_or_na_cases /
cdc_nyt_counties_race_df.cdc_cases, 2)
#PrintSummaryStats(cdc_nyt_counties_race_df)
# These values are used in the tables about coverage.
#print(len(cdc_counties_race_df))
#print(len(cdc_counties_race_df) / 3143)
#print(cdc_counties_race_df.total_pop.sum())
#print(cdc_counties_race_df.total_pop.sum() / 324697795) # Population covered in these counties
#print(cdc_counties_race_df.cdc_known_cases.sum())
#print(0.55 * 324697795) # NYT population
#@title
cdc_nyt_county_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_county_title = 'Ratio of CDC to NYT Cases by County up to %s' % DATE_DISPLAY_NAME
zoom_cdc_nyt_title = 'Zoom in on counties with up to 200,000 Cases'
cdc_nyt_county_scatter = CreateScatterPlot(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, COUNTY_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, 'county', 'ratio'
)
cdc_nyt_county_zoom_scatter = CreateScatterPlot(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, zoom_cdc_nyt_title, COUNTY_CASES_ZOOM_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, 'county', 'ratio'
)
(cdc_nyt_county_scatter | cdc_nyt_county_zoom_scatter).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_nyt_counties_race_df)
Harris County, Texas is missing 365K cases compared to the total cases in the NYT data (5% of the 7.1M missing cases across all states).
The ratio of CDC to NYT cases is between 0.00 and 3.2 for the 3,063 counties in the CDC data that were also in the NYT data:
We can also view these ratios on the map on the right and compare them to the state-level totals map above on the left.
#@title
cdc_nyt_states_title = 'Ratio of CDC to NYT Cases by County up to %s' % DATE_DISPLAY_NAME
cdc_nyt_county_map = CreateMap(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'county', 'ratio'
)
cdc_nyt_state_map = CreateMap(
cdc_nyt_states_df, cdc_nyt_state_fields_dict, cdc_nyt_states_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'ratio'
)
(cdc_nyt_state_map | cdc_nyt_county_map).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
Notes:
We can see that the ratio of the CDC case data to NYT aggregate data is highly variable across the U.S., but there is less variability across the counties within each state. This pattern indicates that the data completeness issues may be due to policies or data collection pipelines at the state level rather than the county level. We can also see that some counties are missing entirely from the data; e.g., in Texas and Wyoming. It's possible some of these counties have cases in the data but the county name was suppressed for privacy reasons due to small population sizes. Even so, those cases would still have a state name, and so they would be captured in the map on the left above.
How much race/ethnicity information is available in the CDC data at the state and county levels?
#@title
cdc_states_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_QUERY, 'cdc_')
cdc_states_race_df.reset_index(inplace=True)
cdc_race_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Cases with race/ethnicity'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent cases with race/ethnicity'},
}
cdc_states_race_title = 'CDC Percent of Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_states_race_map = CreateMap(
cdc_states_race_df, cdc_race_fields_dict, cdc_states_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)
cdc_counties_race_title = 'CDC Percent of Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_counties_race_map = CreateMap(
cdc_nyt_counties_race_df, cdc_race_fields_dict, cdc_counties_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'county', 'percent'
)
(cdc_states_race_map | cdc_counties_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(cdc_states_race_df, 'cdc_known_cases_percent')
Note:
California is missing race/ethnicity data for 2.6M cases (75% of California's cases and 27% of the 9.7M total cases missing race/ethnicity).
There's high variability at the state level:
At the county level, many states have similar percentages of cases with race/ethnicity at the county level as they do at the state level, while other states such as Texas, West Virginia, New Mexico, and Missouri have a high variability at the county level. Note that states and counties with small populations will have a higher level of data suppression, which will result in a lower percentage of cases with race/ethnicity.
How does the CDC dataset compare to the CRDT dataset, which is the most up-to-date aggregate dataset for race/ethnicity at the state level up to March 7, 2021? Overall, 67% of the cases in the CRDT data have known race and 59% have known ethnicity. In the CDC data up to March 7, 57% of cases have known race/ethnicity combined.
We will use the race field in the CRDT data as a proxy for a combined race/ethnicity field. For some states, the race field is a combined race/ethnicity field that exactly matches how the CDC Restricted Access dataset reports race/ethnicity. However, the CRDT also captures the many non-standard ways that states report race/ethnicity, as described in this Covid Racial Data Tracker analysis. There's no way to do an exact comparison between the standardized race/ethnicity category in the CDC data and the many ways that race and ethnicity are reported in the CRDT. We use the race field in the CRDT dataset because it's a closer approximation of the combined race/ethnicity field than the ethnicity field.
#@title
cdc_up_to_crdt_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_UP_TO_CRDT_QUERY, 'cdc_')
crdt_df = CreateCRDTStateRaceDataframe(CRDT_QUERY)
cdc_crdt_race_df = cdc_up_to_crdt_race_df.join(crdt_df, on="state_fips_code", how='inner', lsuffix='', rsuffix='_right')
cdc_crdt_race_df.reset_index(inplace=True)
cdc_crdt_race_df['percent'] = round(cdc_crdt_race_df.cdc_known_cases / cdc_crdt_race_df.crdt_known_race_cases, 4)
#@title
crdt_race_fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Cases with race/ethnicity'},
'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT cases'},
'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent cases with race/ethnicity'},
}
crdt_race_title = 'CRDT Percent Cases with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
crdt_race_map = CreateMap(
cdc_crdt_race_df, crdt_race_fields_dict, crdt_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH - 5, 'state', 'percent'
)
cdc_states_race_up_to_crdt_title = 'CDC Percent of Cases with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
cdc_states_race_up_to_crdt_map = CreateMap(
cdc_crdt_race_df, cdc_race_fields_dict, cdc_states_race_up_to_crdt_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)
(cdc_states_race_up_to_crdt_map | crdt_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(cdc_crdt_race_df, 'crdt_known_race_cases_percent')
Note that we only looked at CDC cases up to March 7 when comparing against the CRDT.
For the CRDT:
Overall, the CRDT has a higher percentage of cases with known race/ethnicity than CDC at the state level. Although it appears that the CDC has better data for Texas than the CRDT, the maps above don't account for the fact that the CDC only contains 3% of the cases in the CRDT data. To take that into account, we can compare the number of cases within each state that has known race/ethnicity instead of the percent of cases.
#@title
cdc_crdt_race_fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT cases with race/ethnicity'},
'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC cases with race/ethnicity'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_race_title = 'Ratio of CDC to CRDT Cases with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_crdt_race_df, cdc_crdt_race_fields_dict, cdc_crdt_race_title, CASES_RACE_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH - 5, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_crdt_race_df[cdc_crdt_race_df.state != 'NY'])
Notes:
CRDT has race/ethnicity data for 1.8M more people in California and 627K more people in Florida than the CDC data has.
The ratio of CDC to CRDT cases with race/ethnicity is between 0.01 and 1.04 for all states excluding New York:
Overall, the CRDT is a more complete source for race/ethnicity dataset at the state level than the CDC data in terms of both the counts of cases with race/ethnicity data and the percentage of cases with race/ethnicity data. The only exceptions to this are New York, which has no cases with race/ethnicity and Massachusetts, which has 1.04 times as many cases with race/ethnicity than the CRDT.
The Public Use with Geography dataset offers a way to analyze data at the state and county level without applying for access. The dataset is released on the same day as the Restricted Access dataset and includes the same total number of cases. There are several differences from the Restricted Access dataset:
Due to the increased privacy measures, race/ethnicity is suppressed for 20% of cases vs. 1% in the Restricted Access dataset.
#@title
field_list = ['case_month', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race', 'ethnicity', 'race_ethnicity_combined']
FieldAnalysis(PROJECT_ID, CDC_PUBLIC_GEO_TABLE, field_list, calculate_race_ethnicity=True).display()
We can also see that far fewer counties are included in the dataset, 47% of counties vs. 98% in the Restricted Access dataset, although they are generally counties with large populations, so they still account for 89% percent of the U.S. population.
#@title
# Manually update these fields based on the latest CDC data.
num_counties_row_names = [
'Cases with race/ethnicity',
'(as a % of cases in dataset)',
'Number of counties',
'(as a % of all counties)',
'Population in those counties',
'(as a % of total U.S population – States + D.C.)',
]
num_counties_cdc_metadata = [
'12.8M',
'(57%)',
'3,072',
'(98%)',
'324M',
'(99.9%)',
]
num_counties_cdc_public_geo_metadata = [
'10.3M',
'(46%)',
'1,471',
'(47%)',
'288M',
'(89%)',
]
table_data = {'CDC Restricted': num_counties_cdc_metadata,
'CDC Public Geo': num_counties_cdc_public_geo_metadata,
}
metadata_df = pd.DataFrame(table_data, index=num_counties_row_names)
metadata_df.head(15)
The total case counts for the CDC Public Use with Geography ("CDC Public Geo" below) and CDC Restricted Access dataset ("CDC" below) are similar with only minor differences due to differing privacy restrictions.
#@title
cdc_public_geo_states_df = CreateCDCStateDataframe(CDC_PUBLIC_GEO_STATES_QUERY)
cdc_public_geo_states_df = cdc_states_df.join(cdc_public_geo_states_df, on="state_fips_code", how='inner', lsuffix='', rsuffix='_right')
cdc_public_geo_states_df.reset_index(inplace=True)
cdc_public_geo_states_df['percent'] = round(cdc_public_geo_states_df.cdc_public_geo_cases / cdc_public_geo_states_df.cdc_cases, 4)
#@title
cdc_public_geo_state_fields_dict = {
'x': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
'y': {'name': 'cdc_public_geo_cases', 'format': ',', 'title': 'CDC Public cases'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC Public Geo to NYT'},
}
cdc_public_geo_state_title = 'Ratio of CDC Public Geo to CDC Cases by State up to %s' % DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_public_geo_states_df, cdc_public_geo_state_fields_dict, cdc_public_geo_state_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_public_geo_states_df)
When we compare case counts at the county level, we can see that the Public Geo dataset only contains 1,471 of the 3,072 counties in the Restricted Access dataset. We can also see that these counties are those that have larger populations (at least 20K people and 1K cases).
#@title
cdc_public_geo_counties_race_df = CreateCDCCountyRaceDataframe(CDC_PUBLIC_GEO_COUNTIES_RACE_QUERY, 'cdc_public_geo_')
cdc_public_geo_counties_race_df = cdc_counties_race_df.join(cdc_public_geo_counties_race_df, on="county_fips", how='left', lsuffix='', rsuffix='_right')
cdc_public_geo_counties_race_df = cdc_public_geo_counties_race_df.reset_index()
cdc_public_geo_counties_race_df['percent'] = round(cdc_public_geo_counties_race_df.cdc_public_geo_cases / cdc_public_geo_counties_race_df.cdc_cases, 2)
cdc_public_geo_counties_race_df['cdc_known_cases_percent'] = round(cdc_public_geo_counties_race_df.cdc_known_cases /
cdc_public_geo_counties_race_df.cdc_cases, 2)
cdc_public_geo_counties_race_df['cdc_public_geo_known_cases_percent'] = round(cdc_public_geo_counties_race_df.cdc_public_geo_known_cases /
cdc_public_geo_counties_race_df.cdc_public_geo_cases, 2)
cdc_public_geo_counties_race_df['known_percent'] = round(cdc_public_geo_counties_race_df.cdc_public_geo_known_cases /
cdc_public_geo_counties_race_df.cdc_known_cases, 2)
cdc_public_geo_only_counties_race_df = cdc_public_geo_counties_race_df[cdc_public_geo_counties_race_df.cdc_public_geo_cases.isna() == False]
#PrintSummaryStats(cdc_public_geo_only_counties_race_df)
# These values are used in the tables about coverage.
#print(len(cdc_public_geo_only_counties_race_df))
#print(len(cdc_public_geo_only_counties_race_df) / 3143)
#print(cdc_public_geo_only_counties_race_df.total_pop.sum())
#print(cdc_public_geo_only_counties_race_df.total_pop.sum() / 324697795) # Population covered in these counties
#print(cdc_public_geo_only_counties_race_df.cdc_public_geo_known_cases.sum())
#@title
cdc_public_geo_county_fields_dict = {
'x': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
'y': {'name': 'cdc_public_geo_cases', 'format': ',', 'title': 'CDC Public Geo cases'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CDC Public Geo'},
}
cdc_public_geo_county_title = 'Ratio of CDC Public Geo to CDC Cases by County up to %s' % DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_public_geo_counties_race_df, cdc_public_geo_county_fields_dict, cdc_public_geo_county_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'county', 'ratio'
).display()
#PrintSummaryStats(cdc_public_geo_counties_race_df)
We can also compare the percent of cases with race/ethnicity in the Public Geo dataset on the vs. the Restricted Access dataset.
#@title
cdc_public_geo_states_race_df = CreateCDCStateRaceDataframe(CDC_PUBLIC_GEO_STATES_RACE_QUERY, 'cdc_public_geo_')
cdc_public_geo_states_race_df = cdc_states_race_df.join(cdc_public_geo_states_race_df, on="state_fips_code", how='inner', lsuffix='', rsuffix='_right')
cdc_public_geo_states_race_df.reset_index(inplace=True)
cdc_public_geo_states_race_df['percent'] = round(cdc_public_geo_states_race_df.cdc_public_geo_known_cases / cdc_public_geo_states_race_df.cdc_known_cases, 4)
#@title
cdc_public_geo_states_race_fields_dict = {
'x': {'name': 'cdc_public_geo_known_cases', 'format': ',', 'title': 'Cases with race/ethnicity'},
'y': {'name': 'cdc_public_geo_cases', 'format': ',', 'title': 'CDC Public Geo cases'},
'percent': {'name': 'cdc_public_geo_known_cases_percent', 'format': '.0%', 'title': 'Percent cases with race/ethnicity'},
}
cdc_public_geo_states_race_title = 'CDC Public Geo Percent of Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_public_geo_states_race_map = CreateMap(
cdc_public_geo_states_race_df, cdc_public_geo_states_race_fields_dict, cdc_public_geo_states_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)
(cdc_public_geo_states_race_map | cdc_states_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(cdc_public_geo_states_race_df, field='cdc_public_geo_known_cases_percent')
In most states, the Public Geo dataset has a lower percentage of cases with race/ethnicity information than the Restricted Access dataset.
We can compare these percentages more directly by taking the ratio of cases with race/ethnicity in each dataset at the state level.
#@title
cdc_public_geo_states_race_ratio_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC cases with race/ethnicity'},
'y': {'name': 'cdc_public_geo_known_cases', 'format': ',', 'title': 'CDC Public Geo cases with race/ethnicity'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC Public Geo to CDC'},
}
cdc_public_geo_states_race_ratio_title = 'Ratio of CDC Public Geo to CDC Cases with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_public_geo_states_race_df, cdc_public_geo_states_race_ratio_fields_dict, cdc_public_geo_states_race_ratio_title, 1400000, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_public_geo_states_race_df)
As expected, most states have fewer cases with race/ethnicity information in the Public Geo dataset than in the Restricted Access dataset.
The ratio of CDC Public Geo to CDC Restricted Access cases with race/ethnicity is between 0 and 1.40 for all states:
Interestingly, there are three states that have more cases with race/ethnicity in the Public Geo dataset: Louisiana, Wyoming, and West Virginia. Our best guess to explain this is that it's an unintended consequence of the privacy measures. In states with counties (parishes in Louisiana) with small populations or low case counts, the county names will be suppressed and changed to "NA." Once those cases are combined into an "NA" county, there may be enough cases to exceed the threshold for having at least 11 cases with the unique combination of fields including race and ethnicity, which are separate fields in the Public Geo dataset. In the Restricted Access dataset, those cases will be listed under their individual county names and may not meet the bar for having at least 5 cases with the unique combination of fields including race/ethnicity. So, by pooling together cases from small counties into an "NA" county, it's possible to have race/ethnicity information for more cases in the Public Geo dataset than in the Restricted Access dataset at the state level.
At the county level, we can see that the Public Geo dataset has the same or fewer cases with race/ethnicity information than the Restricted Access dataset in every county.
#@title
cdc_public_geo_race_fields_dict = {
'x': {'name': 'cdc_public_geo_known_cases', 'format': ',', 'title': 'CDC Public Geo cases with race/ethnicity'},
'y': {'name': 'cdc_public_geo_cases', 'format': ',', 'title': 'CDC Public Geo cases'},
'percent': {'name': 'cdc_public_geo_known_cases_percent', 'format': '.0%', 'title': 'Percent cases with race/ethnicity'},
}
cdc_public_geo_counties_race_title = 'CDC Public Geo Percent of Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_public_geo_counties_race_map = CreateMap(
cdc_public_geo_counties_race_df, cdc_public_geo_race_fields_dict, cdc_public_geo_counties_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'county', 'percent'
)
(cdc_public_geo_counties_race_map | cdc_counties_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#@title
cdc_public_geo_counties_race_df['known_percent'] = round(cdc_public_geo_counties_race_df.cdc_public_geo_known_cases /
cdc_public_geo_counties_race_df.cdc_known_cases, 2)
cdc_public_geo_compare_race_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC cases with race/ethnicity'},
'y': {'name': 'cdc_public_geo_known_cases', 'format': ',', 'title': 'CDC Public Geo cases with race/ethnicity'},
'percent': {'name': 'known_percent', 'format': '.2f', 'title': 'Ratio of CDC Public Geo to CDC'},
}
cdc_public_geo_compare_race_title = 'Ratio of CDC Public Geo to CDC Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_public_geo_counties_race_df, cdc_public_geo_compare_race_fields_dict, cdc_public_geo_compare_race_title, COUNTY_CASES_RACE_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH - 5, 'county', 'ratio'
).display()
#PrintSummaryStats(cdc_public_geo_counties_race_df, field='known_percent')
How can we use this dataset at the state and county levels?
When using these datasets, we recommend being transparent about the level of completeness or lack thereof in (1) the total case counts, and (2) the percentage of cases with race/ethnicity. Just like we created a composite measure of the overall datasets along these lines, we do the same at the state and county levels in the charts below. You can think of these as "nutrition facts" labels for each state and county's data.
We also recommend looking at the race/ethnicity breakdowns within each state or county to see if the missing race/ethnicity data is disproportionately from one group; e.g., in the case of California having 0% of cases that are Hispanic/Latino when the state public health website reports 55.5%. If states or counties' data are too incomplete to draw conclusions from, you may want to exclude them entirely from your analyses. You may also find it useful to include unknowns to highlight the incompleteness in the data.
The scatterplots below can help to show the completeness issues in each state and county. The scatterplots show (1) case counts as a percentage of the NYT total case counts on the y-axis, and (2) the percentage of cases with known race/ethnicity on the x-axis. The colors of the dots and states on the map show the composite completeness measure by multiplying those two numbers together, which is the percentage of total expected cases that have race/ethnicity in the datasets (blue is more complete and red is less complete).
We show the datasets in order from most to least data completeness: CRDT, CDC Restricted Access, and CDC Public Geo.
#@title
cdc_crdt_race_df = cdc_up_to_crdt_race_df.join(crdt_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_crdt_race_df.reset_index(inplace=True)
nyt_crdt_counts_df.set_index('state_fips_code', inplace=True)
cdc_crdt_race_df.set_index('state_fips_code', inplace=True)
cdc_crdt_race_df_composite = nyt_crdt_counts_df.join(cdc_crdt_race_df, on="state_fips_code", how='inner', lsuffix='', rsuffix='_right')
nyt_crdt_counts_df.reset_index(inplace=True)
cdc_crdt_race_df_composite.reset_index(inplace=True)
cdc_crdt_race_df_composite['percent'] = round(cdc_crdt_race_df_composite.crdt_cases / cdc_crdt_race_df_composite.nyt_cases, 2)
cdc_crdt_race_df_composite['cases_max_100_percent'] = cdc_crdt_race_df_composite.percent.clip(upper=1)
cdc_crdt_race_df_composite['composite_percent'] = cdc_crdt_race_df_composite.cases_max_100_percent * cdc_crdt_race_df_composite.crdt_known_race_cases_percent
crdt_composite_fields_dict = {
'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'CRDT percent of NYT total cases'},
'x': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'CRDT percent with race/ethnicity'},
'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: CRDT percent of NYT total with race/ethnicity'},
}
crdt_composite_title = 'CRDT Percent of NYT Cases with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
state_composite_map = CreateScatterPlotAndMap(
cdc_crdt_race_df_composite, crdt_composite_fields_dict, crdt_composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'percent'
)
state_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_crdt_race_df_composite, 'composite_percent')
#@title
cdc_nyt_states_race_df = cdc_states_race_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_race_df.reset_index(inplace=True)
cdc_nyt_states_race_df['percent'] = round(cdc_nyt_states_race_df.cdc_cases / cdc_nyt_states_race_df.nyt_cases, 2)
cdc_nyt_states_race_df['cases_max_100_percent'] = cdc_nyt_states_race_df.percent.clip(upper=1)
cdc_nyt_states_race_df['composite_percent'] = cdc_nyt_states_race_df.cases_max_100_percent * cdc_nyt_states_race_df.cdc_known_cases_percent
composite_fields_dict = {
'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'CDC percent of NYT total cases'},
'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: CDC percent of NYT total with race/ethnicity'},
}
composite_title = 'CDC Percent of NYT Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
state_composite_map = CreateScatterPlotAndMap(
cdc_nyt_states_race_df, composite_fields_dict, composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'percent'
)
state_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_nyt_states_race_df, 'composite_percent')
#@title
cdc_public_geo_nyt_states_race_df = cdc_public_geo_states_race_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_public_geo_nyt_states_race_df.reset_index(inplace=True)
cdc_public_geo_nyt_states_race_df['percent'] = round(cdc_public_geo_nyt_states_race_df.cdc_public_geo_cases / cdc_public_geo_nyt_states_race_df.nyt_cases, 2)
cdc_public_geo_nyt_states_race_df['cases_max_100_percent'] = cdc_public_geo_nyt_states_race_df.percent.clip(upper=1)
cdc_public_geo_nyt_states_race_df['composite_percent'] = (
cdc_public_geo_nyt_states_race_df.cases_max_100_percent *
cdc_public_geo_nyt_states_race_df.cdc_public_geo_known_cases_percent)
public_geo_composite_fields_dict = {
'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'CDC Public Geo percent of NYT total cases'},
'x': {'name': 'cdc_public_geo_known_cases_percent', 'format': '.0%', 'title': 'CDC Public Geo percent with race/ethnicity'},
'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: CDC Public Geo percent of NYT total with race/ethnicity'},
}
public_geo_composite_title = 'CDC Public Geo Percent of NYT Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
state_composite_map = CreateScatterPlotAndMap(
cdc_public_geo_nyt_states_race_df, public_geo_composite_fields_dict, public_geo_composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH - 5, 'state', 'percent'
)
state_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_public_geo_nyt_states_race_df, 'composite_percent')
Notes:
To compare these three datasets, we can look at how many states have a composite completeness measure of at least 50% and at least 85%.
#@title
# Manually update these fields based on the latest CDC data.
states_row_names = [
'Number of states with composite >= 50%',
'(as a percent of all states + D.C.)',
'Number of states with composite >= 85%',
'(as a percent of all states + D.C.)',
]
states_crdt_metadata = [
'49',
'(96%)',
'16',
'(31%)',
]
states_cdc_metadata = [
'27',
'(53%)',
'2',
'(4%)',
]
states_cdc_public_geo_metadata = [
'16',
'(31%)',
'0',
'(0%)',
]
table_data = {'CRDT': states_crdt_metadata, 'CDC Restricted': states_cdc_metadata, 'CDC Public Geo': states_cdc_public_geo_metadata}
metadata_df = pd.DataFrame(table_data, index=states_row_names)
metadata_df.head(15)
If we require that states or counties have 50% of the total expected cases with race/ethnicity, we can use 49 states from CRDT, 27 states from CDC Restricted, and only 16 states from CDC Public Geo. If we tighten that requirement to 85% of total expected cases with race/ethnicity, we can only use 16 states from CRDT, two states from CDC Restricted, and no states from CDC Public Geo.
We can look at the same scatterplots and maps at the county level for the two CDC datasets.
#@title
cdc_nyt_counties_race_df['cases_max_100_percent'] = cdc_nyt_counties_race_df.percent.clip(upper=1)
cdc_nyt_counties_race_df['composite_percent'] = cdc_nyt_counties_race_df.cases_max_100_percent * cdc_nyt_counties_race_df.cdc_known_cases_percent
county_composite_map = CreateScatterPlotAndMap(
cdc_nyt_counties_race_df, composite_fields_dict, composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'county', 'percent'
)
county_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_nyt_counties_race_df, field='composite_percent')
#greater_than_85_df = cdc_nyt_counties_race_df[cdc_nyt_counties_race_df['composite_percent'] > .85]
#print('total pop > 85%: ', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = cdc_nyt_counties_race_df[cdc_nyt_counties_race_df['composite_percent'] > .50]
#print('total pop > 50%: ', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
#print('total counties: 3143')
#@title
cdc_public_geo_nyt_counties_race_df = cdc_public_geo_counties_race_df.join(nyt_counties_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
cdc_public_geo_nyt_counties_race_df = cdc_public_geo_nyt_counties_race_df.reset_index()
cdc_public_geo_nyt_counties_race_df['percent'] = round(cdc_public_geo_nyt_counties_race_df.cdc_public_geo_cases / cdc_public_geo_nyt_counties_race_df.nyt_cases, 2)
cdc_public_geo_nyt_counties_race_df['cases_max_100_percent'] = cdc_public_geo_nyt_counties_race_df.percent.clip(upper=1)
cdc_public_geo_nyt_counties_race_df['composite_percent'] = (
cdc_public_geo_nyt_counties_race_df.cases_max_100_percent *
cdc_public_geo_nyt_counties_race_df.cdc_public_geo_known_cases_percent)
county_composite_map = CreateScatterPlotAndMap(
cdc_public_geo_nyt_counties_race_df, public_geo_composite_fields_dict, public_geo_composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH - 5, 'county', 'percent'
)
county_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_public_geo_nyt_counties_race_df, field='composite_percent')
#greater_than_85_df = cdc_public_geo_nyt_counties_race_df[cdc_public_geo_nyt_counties_race_df['composite_percent'] > .85]
#print('total pop > 85%: ', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = cdc_public_geo_nyt_counties_race_df[cdc_public_geo_nyt_counties_race_df['composite_percent'] > .50]
#print('total pop > 50%: ', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
#print('total counties: 3143')
Notes:
At the county level, the Restricted Access dataset is more complete on all measures. However, neither dataset offers very complete data at the county level. If we require that counties have only 50% of the total expected cases with race/ethnicity, the Restricted Access dataset meets that bar for 44% of all counties, and the Public Geo dataset meets that bar for only 2% of all counties.
#@title
# Manually update these fields based on the latest CDC data.
counties_row_names = [
'Number of counties with composite >= 50%',
'(as a percent of all counties)',
'Number of counties with composite >= 85%',
'(as a percent of all counties)',
'Population in counties with composite >= 50%',
'(as a % of total U.S population – States + D.C.)',
'Population in counties with composite >= 85%',
'(as a % of total U.S population – States + D.C.)',
]
counties_crdt_metadata = [
'-',
'-',
'-',
'-',
'-',
'-',
'-',
'-',
]
counties_cdc_metadata = [
'1,401',
'(44%)',
'135',
'(4%)',
'131M',
'(40%)',
'9M',
'(3%)',
]
counties_cdc_public_geo_metadata = [
'495',
'(2%)',
'1',
'(0%)',
'75M',
'(23%)',
'31K',
'(0%)',
]
table_data = {'CRDT': counties_crdt_metadata, 'CDC Restricted': counties_cdc_metadata, 'CDC Public Geo': counties_cdc_public_geo_metadata}
metadata_df = pd.DataFrame(table_data, index=counties_row_names)
metadata_df.head(15)
We have several recommendations for improving the CDC datasets.
Work with states and counties to improve their reporting processes to
A few states seem to have errors where entire race/ethnicity groups are missing from the dataset:
In March 2021, members of the CDC’s Surveillance Review and Response Group said they were aware of these state-specific issues and are actively working to improve them. We hope they are able to do so in the coming months.
Publish a new dataset based on the Public Geo dataset with only state data
We were glad to see the Public Geo dataset's release because it enables faster and easier access to a broader group of people than the Restricted Access dataset. However, once we started looking into the dataset, we saw that the privacy restrictions at the state and county levels resulted in an even less complete version of the Restricted Access dataset. As discussed above, only 31% of states (16 states) had at least 50% of expected cases with race/ethnicity, and only 2% of all counties (495 counties) met that same threshold.
We recommend that the CDC publishes a new dataset based on the Public Geo dataset that either removes the county column entirely or sets all counties to have a blank or "NA" value before suppressing data for privacy. This would create a dataset optimized for use at the state level with less privacy suppression than the current dataset as a result of removing county information.
While this wouldn't help people analyze case data at the county level, it would help create a public dataset that could help fill the gap that the CRDT left behind. Interest in state-level data is higher than for county-level data, and the Public Geo dataset is so incomplete at the county level that we recommend using the Restricted Access dataset for that purpose.
Exclude Unknowns and Missing values from the data suppression procedure
We learned from the Privacy Paper (p. 9) and the Privacy Review that Unknown and Missing values are reclassified to "NA" in the data suppression procedure. However, Unknown and Missing values do not provide any more personally identifiable information than an "NA" does. We recommend preserving the Unknown and Missing values by excluding them as special cases in the data suppression procedure.
While this may seem like a small detail, this would allow us to better evaluate the degree to which states and counties' incomplete race/ethnicity data is due to data suppression vs. truly incomplete data. We could identify small states and counties that may be doing an excellent job with data completeness but have a high level of data suppression. We could break down the percent of cases without race/ethnicity into the percent that is Unknown or Missing vs. the percent that was known and then suppressed. With the current privacy suppression procedure, we can't be sure which NAs were originally Unknowns, Missing, or a known race/ethnicity value.
Fix the county FIPS code field issues
The CDC Restricted dataset includes a county_fips_code field with a unique identifier for each county. However, we used a different mapping to county FIPS codes due to data quality issues. When we used the county_fips_code field provided in the CDC dataset, 43K records with known state and county values had no county_fips_code, including all but five cases in D.C. We created a mapping using the American Community Survey (ACS) 2019 5-year estimates data and then modified the mapping to handle cases of misspellings and other issues in the CDC dataset. We documented the changes and included the new mapping in this spreadsheet.
With the new mapping, we now match all but 1.3K cases with known state and county values to county FIPS codes. We also identified 77 non-existent state-county combinations listed here that the CDC file was matching to county_fips_codes for 780 cases. We no longer match them to any county_fips_codes, but we do report them in the state-level data for that state.
We recognize that the CDC is consolidating data from multiple sources and having to standardize them, as described in their FAQs and Privacy Paper (p. 8). However, anything they can do to reduce the burden on the users of the dataset would be appreciated; currently, we check the Restricted data file each month, perform several checks, and update our mapping as needed. If the county_fips_code field were more reliable, it would make the dataset more accurate and save time for users. Our new mapping allowed us to include 42K more records with county FIPS codes than if we had used the dataset's county FIPS code field. If the county_fips_code field could be cleaned up, that field could also be used for data suppression instead of res_county, which would allow counties that have cases listed under multiple res_county names to be pooled together and possibly result in less data suppression.
The additional fields in the dataset are quite incomplete; whether the person died is known for 53% of cases, whether they were hospitalized is known for 40% of cases, the symptom onset date is known for 45% of cases, and all other fields are known for 29% or fewer of all cases.
#@title
field_list = ['death_yn', 'hosp_yn', 'icu_yn', 'onset_dt', 'pos_spec_dt', 'hc_work_yn',
'pna_yn', 'abxchest_yn', 'acuterespdistress_yn', 'mechvent_yn', 'fever_yn', 'sfever_yn', 'chills_yn', 'myalgia_yn', 'runnose_yn',
'sthroat_yn', 'cough_yn', 'sob_yn', 'nauseavomit_yn', 'headache_yn', 'abdom_yn', 'diarrhea_yn', 'medcond_yn']
FieldAnalysis(PROJECT_ID, CDC_TABLE, field_list).display()
The CDC also commented on these fields in their case data FAQs:
Because of the volume of cases, most health departments are unable to conduct investigations of every case to obtain additional information. Because of this, most case reports are missing data on patient demographics, symptoms, underlying health conditions, characteristics of hospitalizations such as ventilator use, and other factors such as recent travel history.
The case report form contains many more fields, but unfortunately, the fields get more incomplete as you go down the form. Citizens for Responsibility and Ethics in Washington (CREW) obtained a version of this data via FOIA that contained 101 fields with data up to Aug 25, 2020 and shared it with MSM/SHLI. Several of the additional fields from that dataset are shown below; the field with the most known values is whether the case was associated with an outbreak, but even that is only known for 30% of cases.
#@title
field_list = ['death_week', 'icu_length', 'hosp_length', 'translator_yn', 'housing', 'exp_work_critical', 'outbreak_associated',
'rigors_yn', 'taste_yn', 'fatigue_yn', 'wheezing_yn', 'diffbreathing_yn', 'chestpain_yn', 'test_pcr', 'test_serologic',
'exp_adultfacility', 'exp_airport', 'exp_animal', 'exp_community', 'exp_gathering', 'exp_contact', 'exp_correctional',
'exp_ship', 'exp_house', 'exp_other', 'exp_school', 'exp_othcountry', 'exp_unk', 'exp_work']
project_id = 'msm-internal-data'
FieldAnalysis(PROJECT_ID, CREW_TABLE, field_list).display()
The COVID Data Tracker dashboard is the easiest way to see U.S.-level data with race/ethnicity breakdowns for cases, deaths, and over time. The COVID Data Tracker dashboard lists the Public Use case surveillance data as the underlying data source, but there are some unexplained differences. For example, if we compare a snapshot of the COVID Data Tracker dashboard from March 17, the COVID Data Tracker contains 21.5M cases where 52% have race/ethnicity, while the case surveillance data up to March 16 has 22.5M cases where 57% have race/ethnicity. One possible explanation is that more information about race/ethnicity came into the CDC between March 17 and March 31 when the case surveillance datasets were released, but this is just speculation.
It would be helpful if the COVID Data Tracker dashboard had the following documentation, especially for people who are trying to compare the dashboard against the case surveillance datasets:
Has the CDC case surveillance data gotten more complete since the NYT obtained a copy of the case surveillance data in May 2020? Based on the comparison table below, the dataset has improved with more counties included and a higher percent of cases with race/ethnicity and county. Some of that may be due to the fact that there are more counties with more COVID-19 cases since May 2020 and/or the CDC's data suppression procedures have changed since then.
#@title
# Manually update these fields based on the latest CDC data.
# SELECT
# count(*) as count
# FROM `msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20210131`
# https://covidtracking.com/data/national
# County data calculated in Counties: CDC vs. NYT section.
row_names = [
'Update frequency',
'Latest cases date',
'Cases in dataset as of date',
'Cases in NYT as of date',
'(as a % of NYT)',
'Cases with known race/ethnicity and county',
'(as a % of cases in dataset)',
'Number of counties',
'(as a % of all counties)',
'Population in those counties',
'(as a % of total U.S population – States + D.C.)',
]
nyt_cdc_metadata = [
'Once',
'May 28, 2020',
'1.4M',
'1.7M',
'(88%)',
'~0.6M',
'(44%)',
'974',
'(31%)',
'178M',
'(~55%)',
]
cdc_metadata = [
'Monthly',
'Mar 16, 2021',
'22.5M',
'29.6M',
'(76%)',
'12.6M',
'(56%)',
'3,072',
'(98%)',
'324M',
'(99.9%)',
]
table_data = {'NYT/CDC': nyt_cdc_metadata, 'CDC Restricted': cdc_metadata}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
Sources: NYT article and The Daily podcast episode about the article, CTP total case counts for the U.S. by date.
Here is the full comparison between the CRDT, CDC Restricted, and CDC Public Geo datasets. Parts of this table appear in the paper above; this consolidates them into all one place for a full overview.
#@title
table_data = {'CRDT': overall_crdt_metadata + (len(num_counties_cdc_metadata) - 2) * ['-'] + states_crdt_metadata + counties_crdt_metadata,
'CDC Restricted': overall_cdc_metadata + num_counties_cdc_metadata[2:] + states_cdc_metadata + counties_cdc_metadata,
'CDC Public Geo': overall_cdc_public_geo_metadata + num_counties_cdc_public_geo_metadata[2:] + states_cdc_public_geo_metadata + counties_cdc_public_geo_metadata}
metadata_df = pd.DataFrame(table_data, index=overall_row_names + num_counties_row_names[2:] + states_row_names + counties_row_names)
metadata_df.head(25)
To make it easier to hover over small counties, here are larger versions of the county maps that appeared in this report.
#@title
cdc_nyt_county_map_big = CreateMap(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'ratio'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
)
cdc_nyt_county_map_big.display()
#@title
cdc_counties_race_map_big = CreateMap(
cdc_nyt_counties_race_df, cdc_race_fields_dict, cdc_counties_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
)
cdc_counties_race_map_big.display()
#@title
cdc_county_composite_map_big = CreateMap(
cdc_nyt_counties_race_df, composite_fields_dict, composite_title, 1, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
)
cdc_county_composite_map_big.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#@title
#cdc_counties_race_suppressed_map_big = CreateMap(
# cdc_nyt_counties_race_df, cdc_race_suppressed_dict, cdc_counties_race_suppressed_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
#).configure_view(
# strokeWidth=0,
#).configure_legend(
# gradientLength=MAP_HEIGHT - 50
#)
#cdc_counties_race_suppressed_map_big.display()
One geographical exception is that the NYT combined the five separate counties that make up New York City into one geographic unit:
Another geographical exception is that the NYT combined four boroughs (county equivalents) in Alaska as two combined boroughs:
We excluded all of these cases from the county-level maps above.
Please email us at shli-covid-data-analysis@googlegroups.com with questions or comments.
#@title
#%%shell
#jupyter nbconvert --to html 'cdc_case_data.ipynb'